In [None]:
import numpy as np
import pandas as pd

**Motivation**

- Analyzing a small, representative subset of the data.
- Testing code on readable, known input.
- Detecting over-fitting by training and testing on different subsets.
- Amplifying small datasets, and estimating population statistics and their variance (bootstrapping).

**Randomization**

In [None]:
# A single random float drawn uniformly from the half-open interval [0, 1).
# Called with no arguments, rand() returns a scalar rather than an array.
np.random.rand()

0.4800909970578978

In [None]:
# 1-D array of 3 random floats, each drawn uniformly from [0, 1)
np.random.rand(3)

array([0.36216338, 0.10815522, 0.34846727])

In [None]:
# Array of 100 random integers drawn from [1, 44): randint's upper bound is
# exclusive, so the values range over 1..43. Use randint(1, 45, 100) to include 44.
# No trailing comma here -- it would wrap the array in a one-element tuple.
np.random.randint(1, 44, 100)

(array([11, 40,  1, 15, 34, 14,  2, 22, 19, 31, 26, 23, 20, 28, 19,  3, 41,
        19, 23, 30, 28, 10, 23,  5, 22, 27, 30, 13, 32,  6, 39,  8, 18, 22,
        32,  9, 36,  5, 31, 34, 42, 14, 16, 24, 12, 19, 34, 33, 37, 16, 14,
         9, 17, 14, 14, 12, 12, 40, 27, 43, 41,  7, 18, 19, 40, 33,  1, 29,
        25, 10, 23, 43, 42, 21,  8, 17, 28, 27, 20, 11, 19, 18, 19, 10,  3,
        14,  5, 37, 27,  4, 34,  2, 40, 10,  3, 24, 26, 16, 36, 41]),)

In [None]:
# Draw 10 values, with replacement, from the labels 'X', 'Y' and 'Z'
np.random.choice(['X', 'Y', 'Z'], size=10)

array(['Z', 'Z', 'Y', 'Y', 'Z', 'Y', 'X', 'X', 'Y', 'Y'], dtype='<U1')

**Randomly Generated Dataframe**

In [None]:
# Build a synthetic DataFrame to sample from. The row count is defined once so
# the three columns cannot drift out of sync (columns of unequal length raise
# a ValueError when the frame is constructed).
N_ROWS = 100
df = pd.DataFrame({
    'A': np.random.randint(1, 100, N_ROWS),          # integers in 1..99 (upper bound is exclusive)
    'B': np.random.rand(N_ROWS),                     # floats in [0, 1)
    'C': np.random.choice(['X', 'Y', 'Z'], N_ROWS),  # categorical labels
})

In [None]:
# Peek at the first 3 rows to check the generated columns
df.head(3)

Unnamed: 0,A,B,C
0,47,0.346712,Z
1,23,0.220037,X
2,78,0.114182,X


**Sampling from dataset**

In [None]:
# One row picked at random (sample() draws a single row when n and frac are omitted)
df.sample()

Unnamed: 0,A,B,C
22,2,0.29071,X


In [None]:
# 5 rows picked at random; sampling is without replacement by default, so no row repeats
df.sample(n=5)

Unnamed: 0,A,B,C
57,88,0.97282,Y
60,40,0.828645,X
86,86,0.464698,X
93,26,0.050152,Y
78,95,0.883174,Y


In [None]:
# A random 10% of the rows (10 rows of the 100-row dataframe)
df.sample(frac=0.1)

Unnamed: 0,A,B,C
93,26,0.050152,Y
77,12,0.110856,Z
43,68,0.324398,Y
2,78,0.114182,X
65,38,0.259684,Y
98,7,0.801435,Y
50,65,0.34053,X
85,66,0.521788,Z
88,74,0.845292,Y
33,21,0.115648,Y


In [None]:
# The first 5 rows, selected by position
df.iloc[:5]

Unnamed: 0,A,B,C
0,47,0.346712,Z
1,23,0.220037,X
2,78,0.114182,X
3,42,0.516399,Y
4,83,0.630134,Y


In [None]:
# Everything after the first 5 rows, selected by position
df.iloc[5:]

Unnamed: 0,A,B,C
5,98,0.065585,Y
6,45,0.308081,Z
7,11,0.756561,X
8,76,0.987339,Y
9,92,0.944758,Z
...,...,...,...
95,85,0.509100,Z
96,96,0.817113,Y
97,29,0.053948,Z
98,7,0.801435,Y


In [None]:
# The first two-thirds of the rows; int() truncates the cut point (66 for 100 rows)
df.iloc[:int(2 / 3 * len(df))]

Unnamed: 0,A,B,C
0,47,0.346712,Z
1,23,0.220037,X
2,78,0.114182,X
3,42,0.516399,Y
4,83,0.630134,Y
...,...,...,...
61,80,0.127827,X
62,35,0.232962,Z
63,28,0.417300,X
64,31,0.400992,Y


In [None]:
# The remaining last third: starts at the same cut point used for the
# first-two-thirds slice, so together the two slices cover every row exactly once
df.iloc[int(2 / 3 * len(df)):]

Unnamed: 0,A,B,C
66,88,0.423409,Y
67,88,0.163039,Y
68,9,0.691206,Z
69,98,0.843983,Y
70,5,0.605099,Z
71,29,0.000336,Y
72,27,0.760821,Z
73,73,0.052839,Y
74,28,0.084699,Z
75,97,0.873854,X


In [None]:
# Sampling with replacement: draw 1000 rows from the 100-row dataframe.
# replace=True is required because n exceeds the number of rows, so rows repeat.
df.sample(n=1000, replace=True)

Unnamed: 0,A,B,C
37,38,0.499139,Y
30,9,0.623434,Y
63,28,0.417300,X
66,88,0.423409,Y
56,19,0.065106,Z
...,...,...,...
70,5,0.605099,Z
31,74,0.247616,Y
52,49,0.780817,X
81,61,0.084555,Y


In [None]:
# Weighted sampling: a row with a higher weight is more likely to end up in the sample.
# Example use: sampling products in proportion to their sales quantities.
# The weights here are random placeholders, one per row of df.
weights = np.random.rand(len(df))
df.sample(n=10, weights=weights)

Unnamed: 0,A,B,C
91,52,0.436112,Y
56,19,0.065106,Z
31,74,0.247616,Y
25,82,0.009539,Y
81,61,0.084555,Y
73,73,0.052839,Y
12,1,0.4396,X
70,5,0.605099,Z
74,28,0.084699,Z
62,35,0.232962,Z


**Sampling from numpy 2d arrays**

- Either sample from the pandas dataframe first, then convert the sample to numpy with `df.to_numpy()`
- Or convert the pandas dataframe to numpy first, then sample from the resulting array

In [None]:
# Synthetic example data: 100 samples with 3 features each, plus one label per sample
X = np.random.rand(100, 3)    # feature matrix, values in [0, 1)
y = 10 * np.random.rand(100)  # labels, scaled to [0, 10)

In [None]:
# Preview only the first 5 rows; echoing the whole 100x3 array floods the cell output
X[:5]

array([[0.69674725, 0.7929846 , 0.33732202],
       [0.94728717, 0.75968323, 0.83374163],
       [0.96525976, 0.97004061, 0.73297883],
       [0.87451195, 0.47103024, 0.59296529],
       [0.1464027 , 0.38733106, 0.4438067 ],
       [0.63945207, 0.86812327, 0.01909439],
       [0.74137846, 0.92031455, 0.0430259 ],
       [0.44753616, 0.23251219, 0.15960172],
       [0.01002027, 0.58795062, 0.52985319],
       [0.18417834, 0.5251731 , 0.6157952 ],
       [0.46205852, 0.08372088, 0.19805176],
       [0.10933042, 0.66146754, 0.28074646],
       [0.10894786, 0.7603271 , 0.11382542],
       [0.56078535, 0.08873593, 0.81978564],
       [0.03508123, 0.70415455, 0.03715475],
       [0.12846695, 0.49675803, 0.72844696],
       [0.69396137, 0.76934679, 0.4835339 ],
       [0.40802568, 0.89899254, 0.40768991],
       [0.29363673, 0.78813646, 0.45320463],
       [0.16386549, 0.41474865, 0.83137166],
       [0.6132869 , 0.610178  , 0.85869946],
       [0.39934549, 0.67779046, 0.99399398],
       [0.

In [None]:
# Preview only the first 5 labels; echoing all 100 values floods the cell output
y[:5]

array([2.49545348, 1.06747105, 2.9963589 , 7.26187602, 6.66507712,
       3.06765783, 2.53350371, 2.71604105, 0.40767457, 4.11685805,
       9.05231503, 0.69765083, 6.02375903, 5.46962292, 7.97359541,
       8.09758432, 4.89203856, 3.20607904, 2.34553287, 9.82903253,
       0.2621172 , 0.16848334, 4.95761748, 7.23485349, 4.10852275,
       1.49389953, 5.28651934, 3.52679844, 2.60076113, 4.1210557 ,
       9.38671635, 1.26869845, 1.64335894, 9.03373292, 8.0559083 ,
       9.20326402, 0.56378666, 8.95268255, 8.86061618, 7.59237246,
       5.92587188, 9.08246502, 2.79692668, 9.78816283, 7.91690885,
       8.61930904, 1.45792758, 6.94428132, 1.84188791, 1.84167408,
       2.89710339, 9.20147645, 5.14569256, 1.64875091, 3.36539841,
       2.21405416, 1.21957189, 2.38204942, 4.04843599, 9.35188615,
       7.05216429, 8.16082431, 0.59830995, 9.82021123, 3.88437365,
       1.66390912, 5.16676035, 9.57067806, 4.55817502, 9.21408677,
       5.56323454, 5.27365783, 7.36014523, 6.31834619, 2.15771

In [None]:
# Number of rows in X: len() of a 2-D array counts along the first axis.
# The result is a plain integer, which is what np.random.choice accepts below.
len(X)

100

In [None]:
# Every valid row position of X: 0, 1, ..., len(X)-1
np.arange(len(X))

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [None]:
# Pick 3 distinct positions out of 0..len(X)-1. Passing the integer len(X) to
# choice() samples from np.arange(len(X)); replace=False forbids repeats.
np.random.choice(len(X), size=3, replace=False)

array([96, 56, 75])

In [None]:
# Pick 10 positions out of 0..len(X)-1 with replacement, so the same
# position may be drawn more than once.
np.random.choice(len(X), size=10, replace=True)

array([18, 29, 15, 15, 98, 75, 65, 10, 40, 75])

In [None]:
# Draw 5 values from ['A', 'B', 'C'] with replacement, using explicit probabilities:
# P(A) = 0, P(B) = 0 and P(C) = 1, so every draw is 'C'.
np.random.choice(['A', 'B', 'C'], size=5, replace=True, p=[0, 0, 1])

array(['C', 'C', 'C', 'C', 'C'], dtype='<U1')

In [None]:
# Choose one third of X's row positions at random, without repeats
# (floor division rounds the sample size down: 33 positions for 100 rows)
indices = np.random.choice(len(X), size=len(X) // 3, replace=False)
indices

array([ 0,  6, 93, 66, 52, 72, 86, 54, 78,  9, 10, 51, 11, 23, 12, 77, 28,
       48, 57, 71, 29, 74, 42, 59, 26, 46, 31, 16, 49, 96, 92, 53, 45])

In [None]:
# Integer-array indexing: the rows of X at the sampled positions,
# returned in the order in which they appear in `indices`
X[indices]

array([[0.69674725, 0.7929846 , 0.33732202],
       [0.74137846, 0.92031455, 0.0430259 ],
       [0.23589699, 0.1105653 , 0.98033187],
       [0.36840342, 0.34833846, 0.40009695],
       [0.75495846, 0.06345878, 0.72197669],
       [0.76139056, 0.3896073 , 0.6576453 ],
       [0.83269956, 0.42092346, 0.07419493],
       [0.45711899, 0.25227027, 0.21388747],
       [0.34948616, 0.5477866 , 0.62677671],
       [0.18417834, 0.5251731 , 0.6157952 ],
       [0.46205852, 0.08372088, 0.19805176],
       [0.28923484, 0.24703255, 0.28904762],
       [0.10933042, 0.66146754, 0.28074646],
       [0.5579676 , 0.71202531, 0.96446214],
       [0.10894786, 0.7603271 , 0.11382542],
       [0.2287668 , 0.52309816, 0.85133831],
       [0.50217347, 0.49020153, 0.42110574],
       [0.90316517, 0.33747304, 0.12941088],
       [0.03952864, 0.26744231, 0.35613494],
       [0.84011545, 0.20644057, 0.87265142],
       [0.56390149, 0.57315548, 0.81417645],
       [0.17902764, 0.44716188, 0.43298573],
       [0.

In [None]:
# Labels at the same sampled positions; reusing `indices` keeps each label
# aligned with its row in X[indices]
y[indices]

array([2.71295602, 2.41685495, 0.42433087, 1.22525619, 1.81554192,
       8.71053627, 5.31037545, 8.31231702, 6.3310664 , 9.37723959,
       6.2914701 , 2.75358706, 5.5080891 , 5.26471927, 8.11995942,
       1.79465658, 9.34949533, 8.22067461, 6.84249542, 6.03691533,
       6.99485292, 9.25788105, 2.11798628, 4.54932669, 9.54837229,
       2.02636238, 0.66390873, 3.76736823, 4.05307367, 9.91324551,
       9.27384121, 7.24043694, 5.72671105])

#### Bonus

- Estimate the mean of a small dataset with the bootstrapping procedure:

```
- Set the number of bootstrap samples B.
- Initialize an empty list to store the means of bootstrap samples.
- For each bootstrap sample, out of total B:
    - Generate a bootstrap sample S_b from the original sample data S by sampling with replacement.
    - Calculate the mean of the bootstrap sample S_b.
    - Store the mean in the list of bootstrap sample means.
- Calculate the estimated mean of the sample data by taking the mean of the list of bootstrap sample means.
```

- Experiment with several small datasets whose points lie at varying distances from the mean (i.e. datasets with different spreads). Compare each dataset's original mean with its bootstrapped mean.